##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 5: Results reported in SI Section C
##################################################################
##################################################################

# Section B did not include any empirical analyses, 
# but simply described the data. Therefore, there is no
# replication script for SI Section B

# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script

# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.5

# load packages required to run this script
library(quanteda)             # CRAN v2.0.1
library(quanteda.textmodels)  # CRAN v0.9.1
library(quanteda.classifiers) # [github::quanteda/quanteda.classifiers] v0.3 
library(ggplot2)              # CRAN v3.3.2
library(caret)                # CRAN v6.0-86
library(xtable)               # CRAN v1.8-4
library(Hmisc)                # CRAN v4.4-0
library(stringr)              # CRAN v1.4.0
library(tidyr)                # CRAN v1.1.0
library(dplyr)                # CRAN v1.0.0

# Note that the quanteda.classifiers package (which contains the MLP classifier)
# was still under development while writing this paper.
# Yet, this classifier or package is not required to reproduce any analysis from the main paper. 
# Please contact the author if you have any questions.

# create custom ggplot2 scheme
# Custom plotting theme used for all figures in this script.
# Starts from theme_minimal() and replaces selected elements: no grid lines,
# a thin solid black panel border, legend at the bottom, and larger text.
# Takes no arguments and returns a ggplot2 theme object.
theme_baser <- function() {
  no_line <- element_blank()

  overrides <- theme(
    # panel: drop every grid line, draw a frame instead
    panel.grid.major.x = no_line,
    panel.grid.major.y = no_line,
    panel.grid.minor.x = no_line,
    panel.grid.minor.y = no_line,
    panel.border = element_rect(fill = NA, color = "black", size = 0.5,
                                linetype = "solid"),
    # titles and facet strips
    plot.title = element_text(size = 15, face = "italic",
                              vjust = 1.5, hjust = 0,
                              margin = margin(0, 0, 12, 0)),
    strip.text = element_text(size = 15, hjust = 0.5,
                              margin = margin(b = 5, r = 5, l = 5, t = 5)),
    # axes
    axis.text = element_text(colour = "black", size = 13),
    axis.title = element_text(size = 13, hjust = 0.5),
    axis.ticks = element_line(size = 0.3),
    axis.ticks.length = unit(0.2, "cm"),
    # legend
    legend.position = "bottom",
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

  theme_minimal() %+replace% overrides
}

# use the custom theme for every plot created below
theme_set(theme_baser())

# load English and German datasets with human-annotated sentences
dat_full_en <- readRDS("data_sentences_classified_english.rds")
dat_full_de <- readRDS("data_sentences_classified_german.rds")


# k-fold cross-validation for English corpus

# build a corpus from the "text" column of the English sentences
corp_en <- dat_full_en %>%
  corpus(text_field = "text")

# reshuffle corpus/order of sentences (fixed seed for reproducibility)
set.seed(135)
corp_en <- corpus_sample(
  corp_en,
  size = NULL, replace = FALSE, prob = NULL, by = NULL
)

toks_en <- tokens(corp_en)

# create 4 equally sized folds: the (shuffled) document positions are cut
# into four consecutive ranges, stored as integer codes in docvar "fold"
n_docs_en <- ndoc(toks_en)
docvars(toks_en, "fold") <- cut(seq(1, n_docs_en), breaks = 4, labels = FALSE)

# Helper: per-class F1, precision and recall of one classifier on one fold.
#
# predicted  : predicted class labels (factor or character)
# actual     : human-coded class labels of the same documents
# classifier : label stored in the "classifier" column of the result
#
# Returns a data frame with one row per class and the columns
# class, F1, Precision, Recall, classifier.
#
# Both label vectors are converted to factors with the SAME level set before
# tabulating, so the confusion table is always square with identically
# ordered rows and columns. This replaces the earlier padding logic, which
# appended all-zero rows for never-predicted classes at the bottom and then
# overwrote the row names with the column names; that mislabels rows whenever
# the missing class is not the last one.
# As before, at least three classes are needed, because only then is
# confusionMatrix()$byClass a matrix.
class_metrics <- function(predicted, actual, classifier) {
  levels_of <- function(x) {
    if (is.factor(x)) levels(x) else sort(unique(as.character(x)))
  }
  # reference classes first, then classes that only occur in the predictions
  class_levels <- union(levels_of(actual), levels_of(predicted))

  tab <- table(
    Prediction = factor(as.character(predicted), levels = class_levels),
    Reference = factor(as.character(actual), levels = class_levels)
  )
  by_class <- caret::confusionMatrix(tab)$byClass

  data.frame(
    class = rownames(by_class),
    F1 = unname(by_class[, "F1"]),
    Precision = unname(by_class[, "Precision"]),
    Recall = unname(by_class[, "Recall"]),
    classifier = classifier,
    stringsAsFactors = FALSE
  )
}

# collects the metrics of all folds (most recent fold first)
conf_folds_en <- data.frame()

# perform 4-fold cross validation
# (kept as a top-level loop because tokens_subset() looks up `i` when
# evaluating the fold condition)
for (i in 1:4) {

  # fold i is held out for testing, the other three folds are used to train
  dfmat_train <- toks_en %>%
    tokens_subset(fold != i) %>%
    dfm(remove_punct = TRUE)

  dfmat_test <- toks_en %>%
    tokens_subset(fold == i) %>%
    dfm(remove_punct = TRUE)

  class_train <- docvars(dfmat_train, "class")
  class_test <- docvars(dfmat_test, "class")

  # train and predict all models (same order of calls as before)
  tmod_nb <- textmodel_nb(dfmat_train, class_train)
  tmod_mlp <- quanteda.classifiers::textmodel_mlp(dfmat_train, class_train) # might change since a development version is used
  tmod_svm <- textmodel_svm(dfmat_train, class_train)

  pred_tmod_nb <- predict(tmod_nb, dfmat_test, force = TRUE)
  pred_tmod_mlp <- predict(tmod_mlp, dfmat_test, force = TRUE)
  pred_tmod_svm <- predict(tmod_svm, dfmat_test, force = TRUE)

  # per-class performance of the three classifiers on this fold
  conf <- bind_rows(
    class_metrics(pred_tmod_nb, class_test, "Naive Bayes"),
    class_metrics(pred_tmod_svm, class_test, "SVM"),
    class_metrics(pred_tmod_mlp, class_test, "MLP")
  )
  conf$fold <- paste0("Fold: ", i)

  conf_folds_en <- bind_rows(conf, conf_folds_en)
}

# inspect the first rows of the per-fold results
head(conf_folds_en)

# remove the "Class: " prefix from the class labels
conf_folds_en$class <- str_replace_all(conf_folds_en$class, "Class: ", "")

# fix the display order of the classifiers
conf_folds_en$classifier <- factor(conf_folds_en$classifier,
                                   levels = c("Naive Bayes", "SVM", "MLP"))

# treat undefined F1 scores (NaN) as missing
conf_folds_en$F1[is.nan(conf_folds_en$F1)] <- NA

# reshape to long format: one row per class, classifier and metric value
conf_folds_long_en <- conf_folds_en %>%
  select(class, classifier, F1, Precision, Recall) %>%
  gather(metric, value, -c(class, classifier))

# summary statistics of overall mean by classifier
# (na.rm = TRUE so that a single undefined metric does not turn the whole
# mean into NA; this matches the per-class summary below)
conf_folds_long_en %>%
  group_by(classifier) %>%
  summarise(mean_total = mean(value, na.rm = TRUE))


# calculate mean, maximum and minimum scores for each classifier, class, and metric
# (ungrouped afterwards so later column assignments act on a plain table)
conf_folds_long_sum_en <- conf_folds_long_en %>%
  group_by(class, classifier, metric) %>%
  summarise(mean = mean(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE)) %>%
  ungroup()


# get average scores for SVM classifier (reported in paper)
conf_folds_long_sum_en %>%
  filter(classifier == "SVM") %>%
  filter(metric == "F1")


# order the classes as Future, Present, Past for the facet columns
conf_folds_long_sum_en$class <- factor(
  conf_folds_long_sum_en$class,
  levels = c("Future", "Present", "Past")
)

# Figure A04 ----
# point = mean across folds, line = minimum to maximum across folds;
# one facet per metric (rows) and class (columns)
ggplot(
  data = conf_folds_long_sum_en,
  mapping = aes(x = classifier, y = mean, ymin = min, ymax = max,
                colour = classifier, shape = classifier)
) +
  geom_pointrange(position = position_dodge(width = 0.8)) +
  facet_grid(metric ~ class) +
  coord_flip() +
  scale_colour_manual(values = c("darkred", "darkgreen", "black")) +
  scale_shape_manual(values = c(1, 17, 16)) +
  scale_y_continuous(limits = c(0.3, 1), breaks = seq(0.3, 1, 0.2)) +
  labs(x = NULL, y = "Performance") +
  theme(legend.position = "none")
ggsave("fga04.pdf", width = 10, height = 5)


# k-fold cross-validation for German corpus

# create an ID for each sentence (running row number)
dat_full_de$sentence_id <- seq_len(nrow(dat_full_de))

# construct a text corpus from the "text" column
corp_de <- dat_full_de %>%
  corpus(text_field = "text")

# reshuffle corpus (fixed seed for reproducibility)
set.seed(135)
corp_de <- corpus_sample(
  corp_de,
  size = NULL, replace = FALSE, prob = NULL, by = NULL
)

# tokenize corpus
toks_de <- tokens(corp_de)

# create 4 equally sized folds: the (shuffled) document positions are cut
# into four consecutive ranges, stored as integer codes in docvar "fold"
n_docs_de <- ndoc(toks_de)
docvars(toks_de, "fold") <- cut(seq(1, n_docs_de), breaks = 4, labels = FALSE)


# Helper: per-class F1, precision and recall of one classifier on one fold.
# (defined here so that this section is self-contained)
#
# predicted  : predicted class labels (factor or character)
# actual     : human-coded class labels of the same documents
# classifier : label stored in the "classifier" column of the result
#
# Returns a data frame with one row per class and the columns
# class, F1, Precision, Recall, classifier.
#
# Both label vectors are converted to factors with the SAME level set before
# tabulating, so the confusion table is always square with identically
# ordered rows and columns. This replaces the earlier padding logic, which
# appended all-zero rows for never-predicted classes at the bottom and then
# overwrote the row names with the column names; that mislabels rows whenever
# the missing class is not the last one.
# As before, at least three classes are needed, because only then is
# confusionMatrix()$byClass a matrix.
class_metrics <- function(predicted, actual, classifier) {
  levels_of <- function(x) {
    if (is.factor(x)) levels(x) else sort(unique(as.character(x)))
  }
  # reference classes first, then classes that only occur in the predictions
  class_levels <- union(levels_of(actual), levels_of(predicted))

  tab <- table(
    Prediction = factor(as.character(predicted), levels = class_levels),
    Reference = factor(as.character(actual), levels = class_levels)
  )
  by_class <- caret::confusionMatrix(tab)$byClass

  data.frame(
    class = rownames(by_class),
    F1 = unname(by_class[, "F1"]),
    Precision = unname(by_class[, "Precision"]),
    Recall = unname(by_class[, "Recall"]),
    classifier = classifier,
    stringsAsFactors = FALSE
  )
}

# collects the metrics of all folds (most recent fold first)
conf_folds_de <- data.frame()

# perform 4-fold cross validation
# (kept as a top-level loop because tokens_subset() looks up `i` when
# evaluating the fold condition)
for (i in 1:4) {

  # fold i is held out for testing, the other three folds are used to train
  dfmat_train <- toks_de %>%
    tokens_subset(fold != i) %>%
    dfm(remove_punct = TRUE)

  dfmat_test <- toks_de %>%
    tokens_subset(fold == i) %>%
    dfm(remove_punct = TRUE)

  # match features of dfm
  # NOTE(review): the first call aligns the training matrix to the feature
  # set of the test fold, the second aligns the test matrix to the result.
  # Left unchanged so that the reported figures are reproduced.
  dfmat_train <- dfm_match(dfmat_train, featnames(dfmat_test))
  dfmat_test <- dfm_match(dfmat_test, featnames(dfmat_train))

  class_train <- docvars(dfmat_train, "class")
  class_test <- docvars(dfmat_test, "class")

  # train and predict all models (same order of calls as before)
  tmod_nb <- textmodel_nb(dfmat_train, class_train)
  tmod_mlp <- quanteda.classifiers::textmodel_mlp(dfmat_train, class_train) # might change since a development version is used
  tmod_svm <- textmodel_svm(dfmat_train, class_train)

  pred_tmod_nb <- predict(tmod_nb, dfmat_test, force = TRUE)
  pred_tmod_mlp <- predict(tmod_mlp, dfmat_test, force = TRUE)
  pred_tmod_svm <- predict(tmod_svm, dfmat_test, force = TRUE)

  # per-class performance of the three classifiers on this fold
  conf <- bind_rows(
    class_metrics(pred_tmod_nb, class_test, "Naive Bayes"),
    class_metrics(pred_tmod_svm, class_test, "SVM"),
    class_metrics(pred_tmod_mlp, class_test, "MLP")
  )
  conf$fold <- paste0("Fold: ", i)

  conf_folds_de <- bind_rows(conf, conf_folds_de)
}


# clean up the per-fold results: drop the "Class: " prefix, fix the display
# order of the classifiers, and treat undefined F1 scores (NaN) as missing
conf_folds_de <- conf_folds_de %>%
  mutate(
    class = str_replace_all(class, "Class: ", ""),
    classifier = factor(classifier, levels = c("Naive Bayes", "SVM", "MLP")),
    F1 = replace(F1, is.nan(F1), NA)
  )

# reshape to long format: one row per class, classifier and metric value
conf_folds_long_de <- gather(
  select(conf_folds_de, class, classifier, F1, Precision, Recall),
  metric, value, -c(class, classifier)
)

# calculate mean, maximum and minimum scores for each classifier, class, and metric
conf_folds_long_sum_de <- conf_folds_long_de %>%
  group_by(class, classifier, metric) %>%
  summarise(
    mean = mean(value, na.rm = TRUE),
    max = max(value, na.rm = TRUE),
    min = min(value, na.rm = TRUE)
  )


# get average scores for SVM classifier (reported in paper)
conf_folds_long_sum_de %>%
  filter(classifier == "SVM", metric == "F1")

# order the classes as Future, Present, Past for the facet columns
conf_folds_long_sum_de$class <- factor(
  conf_folds_long_sum_de$class,
  levels = c("Future", "Present", "Past")
)

# Figure A05 ----
# point = mean across folds, line = minimum to maximum across folds;
# one facet per metric (rows) and class (columns)
ggplot(
  data = conf_folds_long_sum_de,
  mapping = aes(x = classifier, y = mean, ymin = min, ymax = max,
                shape = classifier, colour = classifier)
) +
  geom_pointrange(position = position_dodge(width = 0.8)) +
  facet_grid(metric ~ class) +
  coord_flip() +
  scale_colour_manual(values = c("darkred", "darkgreen", "black")) +
  scale_shape_manual(values = c(1, 17, 16)) +
  scale_y_continuous(limits = c(0.3, 1), breaks = seq(0.3, 1, 0.2)) +
  labs(x = NULL, y = "Performance") +
  theme(legend.position = "none")
ggsave("fga05.pdf", width = 10, height = 5)


# split up English sentences into 70% for training set and 30% for test set

dat_english <- dat_full_en
n_english <- nrow(dat_english)
dat_english$id <- seq_len(n_english)

# sizes of the two sets (printed for reference)
english_70 <- round(n_english * 0.7, 0)
english_30 <- round(n_english * 0.3, 0)

english_70
english_30

# draw the IDs of the training sentences (fixed seed)
set.seed(123)
sentences_sampled_en <- sample(seq_len(n_english), size = english_70)


# create training and test sets: sampled IDs train, all others test
dat_train_en <- filter(dat_english, is.element(id, sentences_sampled_en))
dat_test_en <- filter(dat_english, !is.element(id, sentences_sampled_en))

# document-feature matrices without punctuation
dfmat_train_en <- dfm(tokens(corpus(dat_train_en)), remove_punct = TRUE)
dfmat_test_en <- dfm(tokens(corpus(dat_test_en)), remove_punct = TRUE)

# train SVM
tmod_svm_en <- textmodel_svm(dfmat_train_en, docvars(dfmat_train_en, "class"))

# predict labels in test set
dat_test_en$class_svm <- predict(tmod_svm_en, dfmat_test_en, force = TRUE)

# confusion matrix: predicted class (rows) against human coding (columns)
tab_en <- table(
  Predicted = paste0("Predicted: ", dat_test_en$class_svm),
  Actual = dat_test_en$class
)


# Table A01 ----
tab_en

# write the confusion matrix as a LaTeX table
tab_en_latex <- xtable(tab_en,
                       caption = "Confusion matrix (English, SVM)",
                       label = "tab:confusion_en")
xtable::print.xtable(tab_en_latex,
                     caption.placement = "top",
                     size = "footnotesize",
                     file = "taba01.tex")


# repeat analysis for German data

dat_german <- dat_full_de
n_german <- nrow(dat_german)
dat_german$id <- seq_len(n_german)

# sizes of the two sets (printed for reference)
german_70 <- round(n_german * 0.7, 0)
german_30 <- round(n_german * 0.3, 0)

german_70
german_30

# draw the IDs of the training sentences (fixed seed)
set.seed(123)
sentences_sampled_ger <- sample(seq_len(n_german), size = german_70)

# sampled IDs form the training set, all other sentences the test set
dat_train_ger <- filter(dat_german, is.element(id, sentences_sampled_ger))
dat_test_ger <- filter(dat_german, !is.element(id, sentences_sampled_ger))

# document-feature matrices without punctuation
dfmat_train_ger <- dfm(tokens(corpus(dat_train_ger)), remove_punct = TRUE)
dfmat_test_ger <- dfm(tokens(corpus(dat_test_ger)), remove_punct = TRUE)


# train SVM and predict labels in test set
tmod_svm_ger <- textmodel_svm(dfmat_train_ger, docvars(dfmat_train_ger, "class"))

dat_test_ger$class_svm <- predict(tmod_svm_ger, dfmat_test_ger, force = TRUE)

# confusion matrix: predicted class (rows) against human coding (columns)
tab_de <- table(
  Predicted = paste0("Predicted: ", dat_test_ger$class_svm),
  Actual = dat_test_ger$class
)


# Table A02 ----
tab_de

# write the confusion matrix as a LaTeX table
tab_de_latex <- xtable(tab_de,
                       caption = "Confusion matrix (German, SVM)",
                       label = "tab:confusion_ger")
xtable::print.xtable(tab_de_latex,
                     caption.placement = "top",
                     size = "footnotesize",
                     file = "taba02.tex")



# load German sentiment dictionary and keep the two emotion categories
# (cannot be included in the Dataverse replication material for copyright reasons)
liwc_german <- dictionary(file = "../data_notshare/LIWC2001_German_UTF8.dic")[c("Posemo", "Negemo")]

# create corpus of German sentences
corp_austria <- corpus(dat_full_de)
ndoc(corp_austria)

# apply German dictionary to tokens object (to score multiword expressions correctly)
toks_liwc <- tokens_lookup(tokens(corp_austria), liwc_german,
                           nested_scope = "dictionary")
dict_results_liwc <- convert(dfm(toks_liwc), to = "data.frame")

# count number of tokens (punctuation excluded)
dict_results_liwc$ntoken <- ntoken(corp_austria, remove_punct = TRUE)

# estimate sentiment (using the aggregation formula by Crabtree et al.):
# positive minus negative matches per 100 tokens
dict_results_liwc <- mutate(dict_results_liwc,
                            sentiment = 100 * (posemo - negemo) / ntoken)

# bind dictionary analysis with full dataset of sentences (column-wise, so
# both tables must be in the same row order) and create an ID for each
# manifesto from party and year
dat_combined <- dict_results_liwc %>%
  bind_cols(dat_full_de) %>%
  mutate(manifesto_id = paste(party, year))

# running row number, used below to split into training and test set
dat_combined$id <- seq_len(nrow(dat_combined))

# draw 5000 sentence IDs for the training set (fixed seed)
set.seed(123)
sentences_sampled <- sample(seq_len(nrow(dat_combined)), size = 5000)

# create test and training sets: sampled IDs train, all others test
dat_train <- filter(dat_combined, is.element(id, sentences_sampled))
dat_test <- filter(dat_combined, !is.element(id, sentences_sampled))

# document-feature matrices without punctuation
dfmat_train <- dfm(tokens(corpus(dat_train)), remove_punct = TRUE)
dfmat_test <- dfm(tokens(corpus(dat_test)), remove_punct = TRUE)

# train the three classifiers on the human-coded classes
class_train <- docvars(dfmat_train, "class")
tmod_nb <- textmodel_nb(dfmat_train, class_train)
tmod_mlp <- quanteda.classifiers::textmodel_mlp(dfmat_train, class_train) # might change since a development version is used
tmod_svm <- textmodel_svm(dfmat_train, class_train)

# predict classes for test set, one new column per classifier
dat_test$class_nb <- predict(tmod_nb, dfmat_test, force = TRUE)
dat_test$class_svm <- predict(tmod_svm, dfmat_test, force = TRUE)
dat_test$class_mlp <- predict(tmod_mlp, dfmat_test, force = TRUE)

# summarise the absolute and relative frequency per class,
# separately for each classifier and the human classification of sentences

# Helper: number (n) and share (freq) of test-set sentences per class within
# each manifesto.
#
# data      : sentence-level data with a manifesto_id column
# class_col : unquoted name of the column holding the class labels
# label     : text stored in the "type" column of the result
#
# Returns an ungrouped table with columns manifesto_id, class, n, freq, type.
# The per-manifesto grouping for the shares is set explicitly instead of
# relying on summarise() dropping the last grouping level.
class_shares_by_manifesto <- function(data, class_col, label) {
  shares <- data %>%
    group_by(manifesto_id, {{ class_col }}) %>%
    summarise(n = n()) %>%
    group_by(manifesto_id) %>%
    mutate(freq = n / sum(n)) %>%
    ungroup()

  shares$type <- label
  rename(shares, class = {{ class_col }})
}

dat_prop_nb <- class_shares_by_manifesto(
  dat_test, class_nb, "Supervised classification: Naive Bayes"
)
dat_prop_svm <- class_shares_by_manifesto(
  dat_test, class_svm, "Supervised classification: SVM"
)
dat_prop_mlp <- class_shares_by_manifesto(
  dat_test, class_mlp, "Supervised classification: MLP"
)
dat_prop_handcoded <- class_shares_by_manifesto(
  dat_test, class, "Human coding"
)

# human-coded counts and shares under their own column names, for joining
dat_prop_handcoded_compare <- dat_prop_handcoded %>%
  select(-type) %>%
  rename(n_handcoded = n,
         freq_handcoded = freq)

# attach the human-coded share to every classifier/manifesto/class row;
# the join keys are stated explicitly so that extra shared columns can never
# silently change the match
dat_prop_compare <- bind_rows(dat_prop_svm,
                              dat_prop_nb,
                              dat_prop_mlp) %>%
  left_join(dat_prop_handcoded_compare, by = c("manifesto_id", "class")) %>%
  mutate(type = str_replace_all(type, "Supervised classification: ", ""))


# correlation between classifier-based and human-coded shares, per classifier
dat_prop_compare_cors <- dat_prop_compare %>%
  group_by(type) %>%
  summarise(cor = cor(freq, freq_handcoded, use = "pairwise.complete.obs"))

dat_prop_compare <- left_join(dat_prop_compare, dat_prop_compare_cors,
                              by = "type")

# facet label combining classifier name and correlation
dat_prop_compare_cors_plot <- dat_prop_compare %>%
  mutate(type_cor = paste0(type, " (r=", round(cor, 2), ")"))

# Figure A06 ----
# classifier-based share (x) against human-coded share (y) of each class per
# manifesto, one facet per classifier, with a linear fit
ggplot(
  data = dat_prop_compare_cors_plot,
  mapping = aes(x = freq, y = freq_handcoded)
) +
  geom_point(aes(colour = class, shape = class), size = 4, alpha = 0.7) +
  geom_smooth(method = "lm", colour = "black") +
  facet_wrap(~type_cor) +
  scale_colour_manual(values = c("black", "grey50", "grey30")) +
  scale_shape_manual(values = c(1, 16, 17)) +
  labs(x = "Proportion of manifesto (based on classifier)",
       y = "Proportion of manifesto\n(based on human coding)") +
  theme(legend.title = element_blank())
ggsave("fga06.pdf", width = 10, height = 5)

# now bootstrap the average sentiment in the test set 
# for the three classifiers and the human codings
#
# Each block below groups the held-out sentences by one class variable,
# passes the sentiment scores of every group to Hmisc::smean.cl.boot(), and
# turns the result into a one-row data frame per class. The plot further down
# reads the columns Mean, Lower and Upper from these tables.
# The prediction-based tables rename their class column to "class" so that
# all four tables can be stacked.
#
# NOTE(review): no seed is set directly before these calls, so the bootstrap
# draws depend on the state of R's random number generator left by the code
# above. Keep the four blocks in this order when reproducing the figure.

# human coding
dict_results_boot_handcoded <- dat_test %>%
  group_by(class) %>%
  do(data.frame(rbind(Hmisc::smean.cl.boot(.$sentiment)))) %>% 
  mutate(type = "Human coding")

# Naive Bayes predictions
dict_results_boot_nb <- dat_test %>%
  group_by(class_nb) %>%
  do(data.frame(rbind(Hmisc::smean.cl.boot(.$sentiment)))) %>% 
  mutate(type = "Supervised classification: Naive Bayes") %>% 
  rename(class = class_nb) 

# SVM predictions
dict_results_boot_svm <- dat_test %>%
  group_by(class_svm) %>%
  do(data.frame(rbind(Hmisc::smean.cl.boot(.$sentiment)))) %>% 
  mutate(type = "Supervised classification: SVM") %>% 
  rename(class = class_svm) 

# MLP predictions
dict_results_boot_mlp <- dat_test %>%
  group_by(class_mlp) %>%
  do(data.frame(rbind(Hmisc::smean.cl.boot(.$sentiment)))) %>% 
  mutate(type = "Supervised classification: MLP") %>% 
  rename(class = class_mlp) 

# bind bootstrapped estimates into a single data frame
dict_results_merged <- bind_rows(dict_results_boot_handcoded,
                                 dict_results_boot_nb,
                                 dict_results_boot_mlp,
                                 dict_results_boot_svm)


# order the classes chronologically for the x-axis
dict_results_merged$class <- factor(dict_results_merged$class,
                                    levels = c("Past", "Present", 
                                               "Future"))

# flag the rows that come from the human coding
# (this column is not used by the plotting code directly below)
dict_results_merged <- dict_results_merged %>% 
  mutate(handcoded_dummy = ifelse(str_detect(type, "Human coding"), TRUE, FALSE))

# Figure A07 ----
# mean sentiment with bootstrapped interval per class; the human coding is
# drawn in red, the three classifiers in black with different shapes
ggplot(
  data = dict_results_merged,
  mapping = aes(x = class, y = Mean, ymin = Lower, ymax = Upper,
                shape = type, colour = type)
) +
  geom_pointrange(size = 0.8, position = position_dodge(width = 0.4)) +
  scale_colour_manual(values = c("red", "black", "black", "black")) +
  scale_shape_manual(values = c(16, 2, 1, 5)) +
  labs(x = NULL, y = "Sentiment in held-out sentences\n(and 95% bootstrapped CIs)") +
  theme(legend.position = "right", legend.title = element_blank())
ggsave("fga07.pdf", width = 10, height = 3.5)



# load the classified manifesto sentences
dat_combined_all <- readRDS("data_manifestos_classified.rds")

# remove all sentences with more than 99 tokens
# (this replaces the dat_combined object created further up)
dat_combined <- dat_combined_all %>%
  filter(ntoken < 100)

# year as number, class as factor
dat_combined[["year"]] <- as.numeric(dat_combined[["year"]])
dat_combined[["class"]] <- factor(dat_combined[["class"]])

# keyness for negative terms and government status
class <- unique(dat_combined[["class"]])
dat_keyness_sentiment_en <- data.frame()


# calculate keyness by language and class

# get English texts and group by class: one document per class, numbers
# removed, only words with at least two characters kept
corp_keyness_en <- corpus(filter(dat_combined, language == "english"))
dfmat_grouped_en <- dfm(corp_keyness_en, remove_numbers = TRUE)
dfmat_grouped_en <- dfm_keep(dfmat_grouped_en, min_nchar = 2)
dfmat_grouped_en <- dfm_group(dfmat_grouped_en, groups = "class")


# for every class, compare it against the other classes and keep the first
# 20 rows returned by textstat_keyness()
df_keyness_en <- data.frame()
for (target_class in docnames(dfmat_grouped_en)) {
  top_terms <- dfmat_grouped_en %>%
    textstat_keyness(target = target_class) %>%
    head(20)
  top_terms$target <- target_class
  df_keyness_en <- rbind(df_keyness_en, top_terms)
}

# relevel factors
df_keyness_en$target <- factor(
  df_keyness_en$target,
  levels = c("Past", "Present", "Future")
)

# Figure A08 ----
# one facet per class; every row of the keyness table gets its own axis
# position (reversed row number), labelled with the term
ggplot(
  data = df_keyness_en,
  mapping = aes(x = factor(nrow(df_keyness_en):1), y = chi2)
) +
  geom_bar(stat = "identity", width = 0.05) +
  geom_point(size = 2) +
  facet_wrap(~target, scales = "free", nrow = 1) +
  coord_flip() +
  scale_x_discrete(labels = df_keyness_en$feature,
                   breaks = nrow(df_keyness_en):1) +
  labs(x = NULL, y = "Keyness statistics") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 10))
ggsave("fga08.pdf", width = 10, height = 5)


# keyness statistics for German per class: one document per class, numbers
# removed, only words with at least two characters kept
corp_keyness_de <- corpus(filter(dat_combined, language == "german"))
dfmat_grouped_de <- dfm(corp_keyness_de, remove_numbers = TRUE)
dfmat_grouped_de <- dfm_keep(dfmat_grouped_de, min_nchar = 2)
dfmat_grouped_de <- dfm_group(dfmat_grouped_de, groups = "class")


# for every class, compare it against the other classes and keep the first
# 20 rows returned by textstat_keyness()
df_keyness_de <- data.frame()
for (target_class in docnames(dfmat_grouped_de)) {
  top_terms <- dfmat_grouped_de %>%
    textstat_keyness(target = target_class) %>%
    head(20)
  top_terms$target <- target_class
  df_keyness_de <- rbind(df_keyness_de, top_terms)
}

# relevel factors
df_keyness_de$target <- factor(
  df_keyness_de$target,
  levels = c("Past", "Present", "Future")
)

# Figure A09 ----
# one facet per class; every row of the keyness table gets its own axis
# position (reversed row number), labelled with the term
ggplot(
  data = df_keyness_de,
  mapping = aes(x = factor(nrow(df_keyness_de):1), y = chi2)
) +
  geom_bar(stat = "identity", width = 0.05) +
  geom_point(size = 2) +
  facet_wrap(~target, scales = "free") +
  coord_flip() +
  scale_x_discrete(labels = df_keyness_de$feature,
                   breaks = nrow(df_keyness_de):1) +
  labs(x = NULL, y = "Keyness statistics") +
  theme(axis.text.x = element_text(angle = 90, size = 10))
ggsave("fga09.pdf", width = 10, height = 5)


# repeat the same keyness analysis but do it separately for each country
countrynames <- unique(dat_combined$countryname)
df_keyness_countries <- data.frame()

for (i in countrynames) {
  # sentences of country i, grouped into one document per class
  corp_country <- corpus(filter(dat_combined, countryname == i))
  dfmat_grouped_country <- dfm(corp_country, remove_numbers = TRUE)
  dfmat_grouped_country <- dfm_keep(dfmat_grouped_country, min_nchar = 2)
  dfmat_grouped_country <- dfm_group(dfmat_grouped_country, groups = "class")

  # first 10 keyness rows for every class within this country
  df_keyness_country <- data.frame()
  for (g in docnames(dfmat_grouped_country)) {
    top_terms <- head(textstat_keyness(dfmat_grouped_country, target = g), 10)
    top_terms$target <- g
    df_keyness_country <- rbind(df_keyness_country, top_terms)
  }

  # tag the rows with the country and append them to the overall table
  df_keyness_country$countryname <- i
  df_keyness_countries <- bind_rows(df_keyness_countries, df_keyness_country)
}

# relevel factor
df_keyness_countries$target <- factor(
  df_keyness_countries$target,
  levels = c("Past", "Present", "Future")
)


# loop through this data frame and create three separate plots (one for each time perspective)
classes_unique <- c("Past", "Present", "Future")

for (i in classes_unique) {
  # keyness rows of the current class, all countries
  df_keyness_countries_class <- filter(df_keyness_countries, target == i)

  # one facet per country; every row gets its own axis position (reversed
  # row number), labelled with the term
  fig_a10 <- ggplot(
    data = df_keyness_countries_class,
    mapping = aes(x = factor(nrow(df_keyness_countries_class):1), y = chi2)
  ) +
    geom_bar(stat = "identity", width = 0.05) +
    geom_point(size = 2) +
    facet_wrap(~countryname, scales = "free") +
    coord_flip() +
    scale_x_discrete(labels = df_keyness_countries_class$feature,
                     breaks = nrow(df_keyness_countries_class):1) +
    labs(x = NULL, y = "Keyness statistics") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 10))

  # file name carries the lower-cased class, e.g. fga10_past.pdf
  ggsave(paste0("fga10_", str_to_lower(i), ".pdf"),
         plot = fig_a10, width = 10, height = 10)
}



# get representative sentences from each temporal perspective
# by filtering the sentences with the highest probabilities for each class
#
# For every class the candidate sentences are restricted to rows whose
# annotations column equals "FALSE" and that have between 11 and 29 tokens;
# they are sorted by class_probability (highest first) and the top 5 are
# kept. head() is used for the cut-off so that a class with fewer than 5
# candidates contributes only the rows it has, instead of being padded with
# all-NA rows as positional indexing [1:5, ] would do.

classes <- c("Past", "Present", "Future")

# English
dat_sentences_en <- data.frame()

for (i in classes) {

  dat_ordered <- dat_combined %>%
    filter(language == "english" & annotations == "FALSE") %>%
    filter(class == i) %>%
    filter(ntoken > 10) %>%
    filter(ntoken < 30) %>%
    arrange(desc(class_probability)) %>%
    select(Language = language_capital,
           Class = class,
           Sentence = text,
           `Posterior prob.` = class_probability)

  dat_top <- head(dat_ordered, 5)

  dat_sentences_en <- bind_rows(dat_top, dat_sentences_en)
}


# German
dat_sentences_de <- data.frame()

for (i in classes) {

  dat_ordered <- dat_combined %>%
    filter(language == "german" & annotations == "FALSE") %>%
    filter(class == i) %>%
    filter(ntoken > 10) %>%
    filter(ntoken < 30) %>%
    arrange(desc(class_probability)) %>%
    select(Language = language_capital,
           Class = class,
           Sentence = text,
           `Posterior prob.` = class_probability)

  dat_top <- head(dat_ordered, 5)

  dat_sentences_de <- bind_rows(dat_top, dat_sentences_de)
}


# extra LaTeX commands for the longtable output: inserted once, at row
# position 0 of the printed table
addtorow <- list(
  pos = list(0),
  command = paste0("\\hline \n",
                   "\\endhead \n",
                   "\\hline \n",
                   "\\endfoot \n",
                   "\\endlastfoot \n")
)



# replace some characters: "$" and "%" are written out as words, then the
# rows are sorted by language and class and the language column is dropped
dat_sentences_en <- dat_sentences_en %>%
  mutate(
    Sentence = Sentence %>%
      str_replace_all("\\$", " US-Dollars") %>%
      str_replace_all("\\%", " percent")
  ) %>%
  arrange(Language, Class) %>%
  select(-c(Language))

dat_sentences_de <- dat_sentences_de %>%
  mutate(
    Sentence = Sentence %>%
      str_replace_all("\\$", " US-Dollar") %>%
      str_replace_all("\\%", " Prozent")
  ) %>%
  arrange(Language, Class) %>%
  select(-c(Language))


# Table A03 ----
# Write the English top-probability sentences to taba03.tex as a longtable.
# tabular.environment is passed to print.xtable() only: xtable() ignores
# arguments it receives through `...`, so setting it there had no effect.
xtable::print.xtable(xtable(dat_sentences_en, 
                            caption = "The 5 English natural sentences with the highest posterior probabilities for each class and language",
                            label = "tab:example_sentences_en",
                            # one entry per column plus a leading entry for
                            # the (suppressed) row names
                            align = c("p{0.1\\textwidth}", 
                                      "p{0.1\\textwidth}", 
                                      "p{0.75\\textwidth}",
                                      "p{0.1\\textwidth}")),
                     type = "latex",
                     format.args = list(big.mark = ","),
                     size = "scriptsize",
                     file = "taba03.tex",
                     include.rownames = FALSE,
                     floating = FALSE,
                     tabular.environment = "longtable",
                     # cell text is written as-is (no LaTeX escaping)
                     sanitize.text.function = function(x){x},
                     add.to.row = addtorow,
                     # only a rule above the header; the remaining rules come
                     # from the commands in addtorow
                     hline.after = c(-1),
                     caption.placement = "top")


# Table A04 ----
# Write the German top-probability sentences to taba04.tex as a longtable.
# tabular.environment is passed to print.xtable() only: xtable() ignores
# arguments it receives through `...`, so setting it there had no effect.
xtable::print.xtable(xtable(dat_sentences_de, 
                            caption = "The 5 German natural sentences with the highest posterior probabilities for each class and language",
                            label = "tab:example_sentences_de",
                            # one entry per column plus a leading entry for
                            # the (suppressed) row names
                            align = c("p{0.1\\textwidth}", 
                                      "p{0.1\\textwidth}", 
                                      "p{0.75\\textwidth}",
                                      "p{0.1\\textwidth}")),
                     type = "latex",
                     format.args = list(big.mark = ","),
                     size = "scriptsize",
                     file = "taba04.tex",
                     include.rownames = FALSE,
                     floating = FALSE,
                     tabular.environment = "longtable",
                     # cell text is written as-is (no LaTeX escaping)
                     sanitize.text.function = function(x){x},
                     add.to.row = addtorow,
                     # only a rule above the header; the remaining rules come
                     # from the commands in addtorow
                     hline.after = c(-1),
                     caption.placement = "top")



# get a random selection of sentences from each class (with probability > 0.75)

# English
# Draw, for each class, five random English sentences among those with a
# posterior class probability above 0.75.
dat_sentences_en_random <- data.frame()

for (target_class in classes) {
    
    # the seed is reset in every iteration, so each class is sampled from the
    # same random-number state
    set.seed(21)
    dat_en_random <- dat_combined %>% 
        filter(language == "english",
               annotations == "FALSE",
               class == target_class,
               class_probability > 0.75,
               ntoken > 10,
               ntoken < 30) %>% 
        group_by(language_capital, class) %>% 
        sample_n(size = 5, replace = FALSE) %>% 
        ungroup() %>% 
        select(Class = class, 
               Sentence = text, 
               `Posterior prob.` = class_probability)
    
    # rows of the current class are placed above those gathered so far
    dat_sentences_en_random <- bind_rows(dat_en_random, dat_sentences_en_random)
}


# German
# Draw, for each class, five random German sentences among those with a
# posterior class probability above 0.75.
dat_sentences_de_random <- data.frame()

for (target_class in classes) {
    
    # the seed is reset in every iteration, so each class is sampled from the
    # same random-number state
    set.seed(21)
    dat_de_random <- dat_combined %>% 
        filter(language == "german",
               annotations == "FALSE",
               class == target_class,
               class_probability > 0.75,
               ntoken > 10,
               ntoken < 30) %>% 
        group_by(language_capital, class) %>% 
        sample_n(size = 5, replace = FALSE) %>% 
        ungroup() %>% 
        select(Class = class, 
               Sentence = text, 
               `Posterior prob.` = class_probability)
    
    # rows of the current class are placed above those gathered so far
    dat_sentences_de_random <- bind_rows(dat_de_random, dat_sentences_de_random)
}

# spell out symbols that are special in LaTeX (the tables are printed without
# sanitizing); the substitutions run one after the other within one mutate()
dat_sentences_de_random <- mutate(
    dat_sentences_de_random,
    Sentence = str_replace_all(Sentence, "\\$", " US-Dollar "),
    Sentence = str_replace_all(Sentence, "€", " Euro "),
    Sentence = str_replace_all(Sentence, "\\%", " Prozent "),
    Sentence = str_replace_all(Sentence, "\\&", " und ")
)

dat_sentences_en_random <- mutate(
    dat_sentences_en_random,
    Sentence = str_replace_all(Sentence, "\\$", " US-Dollars"),
    Sentence = str_replace_all(Sentence, "€", " euros "),
    Sentence = str_replace_all(Sentence, "\\%", " percent"),
    Sentence = str_replace_all(Sentence, "\\&", " and")
)


# Table A05 ----
# Write the randomly sampled English sentences to taba05.tex as a longtable.
# tabular.environment is passed to print.xtable() only: xtable() ignores
# arguments it receives through `...`, so setting it there had no effect.
xtable::print.xtable(xtable(dat_sentences_en_random, 
                            caption = "5 randomly selected English natural sentences per class",
                            label = "tab:example_sentences_en_random",
                            # one entry per column plus a leading entry for
                            # the (suppressed) row names
                            align = c("p{0.1\\textwidth}", 
                                      "p{0.1\\textwidth}", 
                                      "p{0.75\\textwidth}",
                                      "p{0.1\\textwidth}")),
                     type = "latex",
                     format.args = list(big.mark = ","),
                     size = "scriptsize",
                     file = "taba05.tex",
                     include.rownames = FALSE,
                     floating = FALSE,
                     tabular.environment = "longtable",
                     # cell text is written as-is (no LaTeX escaping)
                     sanitize.text.function = function(x){x},
                     add.to.row = addtorow,
                     # only a rule above the header; the remaining rules come
                     # from the commands in addtorow
                     hline.after = c(-1),
                     caption.placement = "top")



# Table A06 ----
# Write the randomly sampled German sentences to taba06.tex as a longtable.
# tabular.environment is passed to print.xtable() only: xtable() ignores
# arguments it receives through `...`, so setting it there had no effect.
xtable::print.xtable(xtable(dat_sentences_de_random, 
                            caption = "5 randomly selected German natural sentences per class",
                            label = "tab:example_sentences_de_random",
                            # one entry per column plus a leading entry for
                            # the (suppressed) row names
                            align = c("p{0.1\\textwidth}", 
                                      "p{0.1\\textwidth}", 
                                      "p{0.75\\textwidth}",
                                      "p{0.1\\textwidth}")),
                     type = "latex",
                     format.args = list(big.mark = ","),
                     size = "scriptsize",
                     file = "taba06.tex",
                     include.rownames = FALSE,
                     floating = FALSE,
                     tabular.environment = "longtable",
                     # cell text is written as-is (no LaTeX escaping)
                     sanitize.text.function = function(x){x},
                     add.to.row = addtorow,
                     # only a rule above the header; the remaining rules come
                     # from the commands in addtorow
                     hline.after = c(-1),
                     caption.placement = "top")



# now get sentences with very positive or very negative levels 
# of sentiment in each class and for the two languages

# estimate sentence-level sentiment (using the LSD scores):
# 100 * (positive_lsd - negative_lsd) / ntoken, computed row by row.
# The calculation is element-wise, so no grouping is needed; ungroup() is kept
# so that dat_combined is guaranteed to be ungrouped afterwards.
dat_combined <- dat_combined %>% 
    mutate(sentiment_sentence_lsd = 100 * ((positive_lsd - negative_lsd) / ntoken)) %>% 
    ungroup()

# classes iterated over in the loops below
classes <- c("Past", "Present", "Future")

# English (positive)
# For each class, collect the English sentences with the highest sentiment
# scores, separately for every incumbency status.
dat_sentences_sentiment_en_pos <- data.frame()

for (target_class in classes) {
    
    dat_ordered <- dat_combined %>% 
        filter(language == "english",
               annotations == "FALSE",
               class == target_class,
               class_probability > 0.8,
               ntoken > 10,
               ntoken < 30) %>% 
        group_by(incumbency_status2_factor, class) %>% 
        arrange(desc(sentiment_sentence_lsd)) %>% 
        # no ungroup() here: the data stay grouped so that top_n() below
        # selects within each incumbency status and class
        select(Class = class, 
               `Posterior prob.` = class_probability,
               Sentence = text, 
               Incumbency = incumbency_status2_factor,
               Sentiment = sentiment_sentence_lsd)
    
    # rows holding the five largest sentiment values per group (ties are kept)
    dat_top_sentiment <- dat_ordered %>% 
        top_n(5, wt = Sentiment)
    
    # rows of the current class are placed above those gathered so far
    dat_sentences_sentiment_en_pos <- bind_rows(dat_top_sentiment, dat_sentences_sentiment_en_pos)
}


# English (negative)
# For each class, collect the English sentences with the lowest sentiment
# scores, separately for every incumbency status.
dat_sentences_sentiment_en_neg <- data.frame()

for (target_class in classes) {
    
    # NOTE(review): the lower token bound is 15 here but 10 in the loop for
    # the positive sentences -- confirm that the difference is intended
    dat_ordered <- dat_combined %>% 
        filter(language == "english",
               annotations == "FALSE",
               class == target_class,
               class_probability > 0.8,
               ntoken > 15,
               ntoken < 30) %>% 
        group_by(incumbency_status2_factor, class) %>% 
        arrange(sentiment_sentence_lsd) %>% 
        # no ungroup() here: the data stay grouped so that top_n() below
        # selects within each incumbency status and class
        select(Class = class, 
               `Posterior prob.` = class_probability,
               Sentence = text, 
               Incumbency = incumbency_status2_factor,
               Sentiment = sentiment_sentence_lsd)
    
    # weighting by -Sentiment picks the five smallest sentiment values per
    # group (ties are kept)
    dat_top_sentiment <- dat_ordered %>% 
        top_n(5, wt = -Sentiment)
    
    # rows of the current class are placed above those gathered so far
    dat_sentences_sentiment_en_neg <- bind_rows(dat_top_sentiment, dat_sentences_sentiment_en_neg)
}


# spell out or remove symbols that would clash with LaTeX (the table is
# printed without sanitizing), then order the rows by class, incumbency and
# decreasing sentiment
dat_sentences_sentiment_en_pos <- dat_sentences_sentiment_en_pos %>% 
    mutate(
        Sentence = str_replace_all(Sentence, "\\$", " US-Dollars"),
        Sentence = str_replace_all(Sentence, "€", " euros "),
        Sentence = str_replace_all(Sentence, "\\%", " percent"),
        Sentence = str_replace_all(Sentence, "\\&", " and"),
        Sentence = str_replace_all(Sentence, "•", "")
    ) %>% 
    arrange(Class, Incumbency, desc(Sentiment))

# Table A07 ----
# Write the most positive English sentences to taba07.tex as a longtable.
xtable::print.xtable(
    xtable(dat_sentences_sentiment_en_pos,
           caption = "The 5 most positive English natural sentences by incumbents and opposition parties (according to the Lexicoder Sentiment Dictionary) per class",
           label = "tab:example_sentences_en_positive",
           # one entry per column plus a leading entry for the (suppressed)
           # row names
           align = c("p{0.1\\textwidth}",
                     "p{0.05\\textwidth}",
                     "p{0.07\\textwidth}",
                     "p{0.6\\textwidth}",
                     "p{0.1\\textwidth}",
                     "p{0.07\\textwidth}")),
    file = "taba07.tex",
    type = "latex",
    tabular.environment = "longtable",
    floating = FALSE,
    size = "scriptsize",
    caption.placement = "top",
    include.rownames = FALSE,
    format.args = list(big.mark = ","),
    # cell text is written as-is (no LaTeX escaping)
    sanitize.text.function = identity,
    add.to.row = addtorow,
    hline.after = -1
)


# spell out or remove symbols that would clash with LaTeX (the table is
# printed without sanitizing), then order the rows by class, incumbency and
# increasing sentiment
dat_sentences_sentiment_en_neg <- dat_sentences_sentiment_en_neg %>% 
    mutate(
        Sentence = str_replace_all(Sentence, "\\$", " US-Dollars"),
        Sentence = str_replace_all(Sentence, "€", " euros "),
        Sentence = str_replace_all(Sentence, "\\%", " percent"),
        Sentence = str_replace_all(Sentence, "\\&", " and"),
        Sentence = str_replace_all(Sentence, "•", "")
    ) %>% 
    arrange(Class, Incumbency, Sentiment)

# Table A08 ----
# Write the most negative English sentences to taba08.tex as a longtable.
# The caption word order now matches Table A07 ("most negative English").
# tabular.environment is passed to print.xtable() only: xtable() ignores
# arguments it receives through `...`, so setting it there had no effect.
xtable::print.xtable(xtable(dat_sentences_sentiment_en_neg, 
                            caption = "The 5 most negative English natural sentences by incumbents and opposition parties (according to the Lexicoder Sentiment Dictionary) per class",
                            label = "tab:example_sentences_en_negative",
                            # one entry per column plus a leading entry for
                            # the (suppressed) row names
                            align = c("p{0.1\\textwidth}", 
                                      "p{0.05\\textwidth}", 
                                      "p{0.07\\textwidth}",
                                      "p{0.6\\textwidth}", 
                                      "p{0.1\\textwidth}",
                                      "p{0.07\\textwidth}")),
                     type = "latex",
                     format.args = list(big.mark = ","),
                     size = "scriptsize",
                     file = "taba08.tex",
                     include.rownames = FALSE,
                     floating = FALSE,
                     tabular.environment = "longtable",
                     # cell text is written as-is (no LaTeX escaping)
                     sanitize.text.function = function(x){x},
                     add.to.row = addtorow,
                     # only a rule above the header; the remaining rules come
                     # from the commands in addtorow
                     hline.after = c(-1),
                     caption.placement = "top")

